#!/bin/bash
#
# Launch `vllm serve` for the Fara-7B checkpoint, pinned to one device.
#
# Optional environment overrides (defaults reproduce the previously
# hard-coded values, so running the script with no extra environment
# behaves as before):
#   FARA_DEVICE_ID   device index exported to the runtime   (default: 2)
#   FARA_MODEL_PATH  model directory passed to vllm         (default below)
#   FARA_PORT        TCP port the server listens on         (default: 8000)
#
# Usage:
#   ./this_script.sh
#   FARA_DEVICE_ID=0 FARA_PORT=8001 ./this_script.sh

set -eu

readonly device_id="${FARA_DEVICE_ID:-2}"
readonly model_path="${FARA_MODEL_PATH:-/data/model/microsoft/Fara-7B/}"
readonly port="${FARA_PORT:-8000}"

# The same index is exported under four names so it stays consistent.
# NOTE(review): the ASCEND_* names suggest an Ascend NPU runtime; which of
# these variables the installed stack actually reads is not visible from this
# script -- confirm before trimming the list.
export DEVICE_ID="${device_id}"
export ASCEND_DEVICE_ID="${device_id}"
export ASCEND_RT_VISIBLE_DEVICES="${device_id}"
export ASCEND_VISIBLE_DEVICES="${device_id}"

# Flags are kept in an array so each one can carry its own comment and no
# backslash continuation can be broken by a stray trailing space.
serve_args=(
  # Name under which clients address the model.
  --served-model-name fara
  # Listen on all interfaces.
  --host 0.0.0.0
  --port "${port}"
  # Single device, matching the single index exported above.
  --tensor-parallel-size 1
  --dtype float16
  # JSON is passed verbatim as one argument; see the vLLM compilation-config
  # documentation for the meaning of "none" and the "+op" entries.
  --compilation-config '{"custom_ops":["none", "+rms_norm", "+rotary_embedding"]}'
  --max-num-seqs 16
  --max-model-len 2048
  --gpu-memory-utilization 0.8
)

# exec replaces this shell with the server process, so signals sent to the
# script's PID (systemd, docker stop, kill) reach vllm directly instead of
# killing only the wrapper, and vllm's exit status becomes the script's.
exec vllm serve "${model_path}" "${serve_args[@]}"